***************
* Title: gambia_ecd_edcc_tableA4-5.do
* Author: Todd Pugatch
* Description: replication code for Blimpo, Carneiro, Jervis, and Pugatch,
*	"Improving Access and Quality in Early Childhood Development Programs: 
*		Experimental Evidence from The Gambia"
*	for Economic Development and Cultural Change
* Inputs: ECD_3to6_Gambia_cleanv1.dta
* Outputs: 	gambia_ecd_edcc_tableA4-5.[txt/xls]
*			gambia_ecd_edcc_tableA4-5[a-d].out
*			gambia_ecd_edcc_tableA4-5_g1.wmf
* Notes: creates Tables A4-A5
****************
#delimit;
local start=`"$S_TIME"';
clear;
clear matrix;
clear mata;
graph drop _all;
cap log close;
set more off;
/*set directory:
	cd mydir
*/
local data=`"Data\cleaned"';
local output=`"analysis\output"';
log using analysis\output\gambia_ecd_edcc_tableA4-5.txt, text replace;

* LOAD AND PREPARE DATA;
* define sample as in gambia_ecd_attrition1.do;
qui use `data'\ECD_3to6_Gambia_cleanv1, clear;
/*NEED TO CHECK TREATMENT STATUS OF SETTLEMENT 37028. DISREGARD FOR NOW*/
qui drop if settlement_code==37028;

* define 3 groups:
	--in baseline
	--in endline (original sample)
	--in endline (newly sampled);
qui gen in_baseline=(ip22!=3 & ip22!=.);	
qui gen in_endline=(interview_result==1);
qui gen in_endline_old=(in_endline==1 & in_baseline==1);
qui gen in_endline_new=(in_endline==1 & in_baseline==0);
/*treat unresolved gender mismatches as new to endline*/
qui replace in_endline_new=1 if in_endline_new==0 & child_gender_mismatch_resolved==0;
qui replace in_endline_old=. if in_endline_new==1;

* repeat for having valid MDAT fine motor and language/hearing scores;
/*note that "in sample" defined as having at least one valid MDAT score, not both as in previous analyses of attrition*/
qui gen in_baseline_mdat=(in_baseline==1 & (zfinemotor_baseline!=.|zlanghear_baseline!=.));
qui replace in_baseline_mdat=. if in_endline_new==1;
foreach x in endline endline_old endline_new {;
	qui gen in_`x'_mdat=(in_`x'==1 & (zfinemotor_endline!=.|zlanghear_endline!=.));
};
qui replace in_endline_old_mdat=. if in_endline_new==1;
qui gen in_endline_old_mdat_base=in_endline_old_mdat;
qui replace in_endline_old_mdat_base=. if in_baseline_mdat!=1; /*in baseline MDAT & in endline MDAT*/
	
/*keep eligibles*/
* keep if baseline age from 3-6 years, or new to endline;
count; 
keep if (child_age_mths_dob>=36 & child_age_mths_dob<84 & in_baseline==1)|(child_age_mths_dob==. & in_baseline==0);

* keep if new to endline and would have been 3-6 at baseline (define as 4-8 at endline to allow errors);
keep if in_endline_new==0|(in_endline_new==1 & selected_child_age>=48 & selected_child_age<=96 & selected_child_age!=.)|
	(in_endline_new==1 & child_gender_mismatch_resolved==0);
	
* baseline data cleaning (follows gambia_ecd_balance3.do);
/*child mother tongue*/
qui gen child_mandinka=(child_mothertongue==1);
qui gen child_fula=(child_mothertongue==2);
qui gen child_jola=(child_mothertongue==4);
qui gen child_serahule=(child_mothertongue==5);
qui gen child_other=(child_mothertongue==3|child_mothertongue>=6); /*includes missing*/
foreach x in mandinka fula jola serahule other {;
	lab var child_`x' "child's mother tongue is `x'";
};

/*hours worked by HH head & mother*/
foreach x in headhh mother {;
	qui gen `x'_hrswrk_topcode=`x'_hrswrk;
	qui replace `x'_hrswrk_topcode=80 if `x'_hrswrk>80 & `x'_hrswrk!=.; /*top-code hours worked at 80*/
	qui tab `x'_hrswrk_cat, mi gen(`x'_hrswrk_cat);
};

/*winsorize annual per-capita expenditure below 1st and above 99th percentiles*/
foreach x in expend_yr_hhpc expend_yr_hhpc_usd {;
	winsor `x', gen(`x'_wins) p(0.01);
};
qui gen ecd_wtppctw_baseline=ecd_wtpq_baseline*12/expend_yr_hhpc_wins;
lab var expend_yr_hhpc_wins "Annual HH expenditure per capita, GMD, winsorized at 1/99 percentiles";
lab var expend_yr_hhpc_usd_wins "Annual HH expenditure per capita, USD, winsorized at 1/99 percentiles";
lab var ecd_wtppctw_baseline "Amount willing to pay for ECD as shr of monthly HH pc expenditure (winsorized) at baseline";

* endline data cleaning;
/*hours worked by HH head & mother*/
foreach x in headhh mother {;
	qui gen `x'_hrswrk_topcode_el=`x'_hrswrk_el;
	qui replace `x'_hrswrk_topcode_el=80 if `x'_hrswrk_el>80 & `x'_hrswrk_el!=.; /*top-code hours worked at 80*/
	qui tab `x'_hrswrk_cat_el, mi gen(`x'_hrswrk_cat_el);
};

/*winsorize annual per-capita expenditure below 1st and above 99th percentiles*/
foreach x in expend_yr_hhpc expend_yr_hhpc_usd {;
	winsor `x'_el, gen(`x'_wins_el) p(0.01);
};
lab var expend_yr_hhpc_wins_el "Annual HH expenditure per capita, GMD, winsorized at 1/99 percentiles, endline";
lab var expend_yr_hhpc_usd_wins_el "Annual HH expenditure per capita, USD, winsorized at 1/99 percentiles, endline";

* ANALYSIS OF ENDLINE RESULTS;
* prep sample;
qui drop if in_endline==0;
qui gen purecontrol=(treatment==1);
qui gen ECDAnnex_control=(treatment==4);
qui gen ECDAnnex_treated=(treatment==5);
qui gen communitybased=(treatment==6);
qui gen ECDAnnex=ECDAnnex_control+ECDAnnex_treated;
qui gen communitybased_all=communitybased+purecontrol;
qui gen experiment=(ECDAnnex==1);
lab def experiment 0 "community-based" 1 "ECD Annex";
lab val experiment experiment;

* normalize endline MDAT scores to combined control group distribution;
/*adjust for age and gender, following normalization of gambia_ecd_clean1.do*/
drop zfinemotor_adj_endline zlanghear_adj_endline;
foreach x in finemotor langhear {;
	qui xi: reg `x'_endline selected_child_age selected_child_age2 child_female 
		if purecontrol==1|ECDAnnex_control==1;
	qui predict e, residuals; 
	qui su e if purecontrol==1|ECDAnnex_control==1;
	qui gen z`x'_adj_endline=(e-r(mean))/r(sd);
	drop e;
};
lab var zfinemotor_adj_endline "MDAT fine motor endline z-score (age-adjusted)";
lab var zlanghear_adj_endline "MDAT language & hearing endline z-score (age-adjusted)";

/*endline outcomes*/
local childY "zfinemotor_endline zlanghear_endline zfinemotor_adj_endline zlanghear_adj_endline haz_endline 
	child_height_endline child_weight_endline childBMI_endline childBMI_under_endline childBMI_over_endline 
	child_armcirm_endline child_bednet_endline child_sick_endline";
local headhhY "headhh_work_el headhh_hrswrk_topcode_el headhh_hrswrk_cat_el1 headhh_hrswrk_cat_el2 headhh_hrswrk_cat_el3 
	headhh_hrswrk_cat_el4 headhh_hrswrk_cat_el5 headhh_ag_el expend_yr_hhpc_usd_wins_el";
local motherY "mother_work_el mother_hrswrk_topcode_el mother_hrswrk_cat_el1 mother_hrswrk_cat_el2 mother_hrswrk_cat_el3 
	mother_hrswrk_cat_el4 mother_hrswrk_cat_el5 mother_ag_el mother_unpaid_el mother_health1_endline mother_health_endline";
local caregiverY "disc_nonvlntpcti_endline disc_psychaggpcti_endline disc_vlntpcti_endline discsevere_nonvlntpcti_endline 
	discsevere_psychaggpcti_endline discsevere_vlntpcti_endline play_items interaction_intensity interaction_play";
local motherknowledge "MK2_food_hygiene MK2_hands_washing MK2_waste_management MK2_drinking_water MK2_hygiene_sanitation 
	MK2_danger_sign MK2_deworming MK2_health MK2_health_g MK2_vitamin_A MK2_iodized_salt MK2_supplementary_food 
	MK2_nutrition_g MK2_nutrition MK2_child_development mother_knowledge1";

* CALCULATE PROPENSITY SCORE;
* limit analysis to those in baseline, because (im)balance should be based on baseline characteristics;
qui drop if in_baseline==0;

/*impute zeroes for missing, then include dummies for this imputation to ensure all observations get a score*/
local X "child_age_mths_dob child_female region2 ecd_attend_baseline zfinemotor_adj_baseline zlanghear_adj_baseline 
	haz_dob_baseline hh_total mother_schl_yrs expend_yr_hhpc_usd_wins ecd_wtppctw_baseline vaccinepct_baseline 
	mother_health_baseline";	
foreach x in `X' {;
	qui gen `x'p=`x';
	qui replace `x'p=0 if `x'==.; /*impute zeroes where missing*/
	qui gen `x'm=(`x'==.);
	lab var `x'm "`x' imputed to zero because missing";
};	
local Xp "child_age_mths_dobp child_femalep region2p ecd_attend_baselinep zfinemotor_adj_baselinep zlanghear_adj_baselinep 
	haz_dob_baselinep hh_totalp mother_schl_yrsp expend_yr_hhpc_usd_winsp ecd_wtppctw_baselinep vaccinepct_baselinep 
	mother_health_baselinep";	
local Xm "child_age_mths_dobm child_femalem region2m ecd_attend_baselinem zfinemotor_adj_baselinem zlanghear_adj_baselinem 
	haz_dob_baselinem hh_totalm mother_schl_yrsm expend_yr_hhpc_usd_winsm ecd_wtppctw_baselinem vaccinepct_baselinem 
	mother_health_baselinem";	
logit ECDAnnex `Xp' `Xm';
qui predict p, pr;
	
* check overlap in distribution & define common support;
/*code follows agsp_reg20.do*/
su p if ECDAnnex==1, d;
su p if ECDAnnex==0, d;
qui gen common_support=1;							// Tag observations that have common support
qui su p if ECDAnnex==1;
qui replace common_support=0 if ECDAnnex==0 & (p>r(max)|p<r(min));
qui su p if ECDAnnex==0;
qui replace common_support=0 if ECDAnnex==1 & (p>r(max)|p<r(min));

/*how many sites fall within common support?*/
tab ECDAnnex common_support, mi;
tab ECDAnnex common_support if in_endline_mdat==1, mi;
twoway (kdensity p if ECDAnnex==1, lpattern(l) lcolor(blue) lwidth(thick))
	(kdensity p if ECDAnnex==0, lpattern(_) lcolor(red) lwidth(thick)),
	title("Propensity score") ytitle("density") xtitle("Pr(treatment)") 
	legend(label(1 "ECD Annex") label(2 "community-based ECD") rows(1))
	nodraw name(g1);
graph display g1;
*qui graph export `output'\gambia_ecd_edcc_tableA4-5_g1.wmf, as(wmf) replace;

* sample sizes (# of children and project sites);
qui egen site=tag(settlement_code);
table experiment, c(freq rawsum site);
table experiment if common_support==1, c(freq rawsum site);
table experiment if common_support==1, 
	c(rawsum in_baseline_mdat rawsum in_endline_mdat rawsum in_endline_old_mdat rawsum in_endline_new_mdat);
	
* define weights;
/*See Imbens & Wooldridge (JEL 2009), section 5.6:
	--without rescaling: if outcome equation correctly specified, pscore equation can be misspecified and outcome 
		estimator is still consistent (robustness 1). Cost is efficiency, b/c these weights won't match optimal 
		weighting on efficiency grounds.
	--with rescaling: if pscore equation correctly specified, outcome equation can be misspecified and weights 
		w/ rescaling still gives consistent estimate of treatment effects (robustness 2).*/
/*unscaled weights*/
qui gen w=1/p if ECDAnnex==1 & common_support==1;
qui replace w=1/(1-p) if ECDAnnex==0 & common_support==1;
		
/*rescaled weights: total propensity score, separately for treatment and control*/
qui bysort ECDAnnex: egen scale=total(p);
qui gen w_scaled=w/scale;	

* BALANCE TESTS;
* check if weighting resolves balance issues;
* follow code in gambia_ecd_results20.do, but focusing on subsets of MDAT items;
* use regressions to get means, standard errors, differences, and p-values;
* must use regression to get standard errors with inversely weighted by p-scores, because "orth_out"
	doesn't have this option;
* define set of X's: these should differ from those used in model for propensity score in order to be a 
	true test;
local X "zfinemotor_baseline zlanghear_baseline zfinemotor_adj_baseline zlanghear_adj_baseline 
	langhear_name_pct_baseline langhear_sentence_baseline langhear_count_pct_baseline langhear_colors_pct_baseline
	finemotor_blocks_pct_baseline finemotor_draw_pct_baseline finemotor_order_pct_baseline";

* first get unweighted differences for comparison;
orth_out `X' using `output'\gambia_ecd_edcc_tableA4-5.xls, by(experiment) se vce(cluster settlement_code) 
	compare test count colnum covar(region2) title("unweighted differences") replace;	
	
local replace "replace";
foreach x in `X' {;
	/*get means and (clustered) standard errors by treatment status*/
	local stat ""No. clusters",e(N_clust)";
	qui reg `x' ECDAnnex communitybased_all [pw=w], nocons cluster(settlement_code);
	outreg ECDAnnex communitybased_all using `output'\gambia_ecd_edcc_tableA4-5a, nocons se noaster addstat(`stat') `replace';

	/*get difference between ECD Annex and community-based experiments, with corresponding p-value of difference*/
	local stat ""No. clusters",e(N_clust),"p: diff b/t ECD Annex and community-based ECD",prct";
	if ("`x'"=="region2") qui reg `x' ECDAnnex [pw=w], cluster(settlement_code);
	else qui reg `x' ECDAnnex region2 [pw=w], cluster(settlement_code);
	qui test ECDAnnex;
	scalar define prct=r(p);
	/*p-value adjusts for stratification by region*/
	qui reg `x' ECDAnnex [pw=w], cluster(settlement_code);
	outreg ECDAnnex using `output'\gambia_ecd_edcc_tableA4-5b, nocons se noaster addstat(`stat') `replace';
};

* REGRESSIONS: INVERSE P-SCORE WEIGHTED, NO CONTROLS;
* note there are four groups:
	1) pure control: no ECD services
	2) community-based ECD: ECD facility (shed), training for providers in new curriculum ("Evaluation chapter.docx", p. 14)
	3) ECD Annex control: new curriculum without training
	4) ECD Annex treated: new curriculum + training;
* make 2 types of comparisons (pure control as reference group in both):
	1) community-based ECD v. ECD Annex (combined treatment & control)
		y = b0 + b1*communitybased + b2*ECDAnnex + e
	2) community-based ECD v. ECD Annex treatment & ECD Annex control, separately
		y = b0 + b1*communitybased + b2*ECDAnnex_control + b3*ECDAnnex_treated + e;
* include region controls for stratification throughout;
* drop observations outside common support;
qui drop if common_support==0;

/*endline outcomes*/
local Y "zfinemotor_adj_endline zlanghear_adj_endline";
	
* 1) community-based ECD v. ECD Annex (combined treatment & control);
local replace "replace";
local stat ""No. clusters",e(N_clust),"diff(Annex v. community-based)",bECDA,"se(Annex v. community-based)",-seECDA,
	"pure control mean",y0,"pure control s.e.",-se0";
foreach y in `Y' {;
	qui reg `y' communitybased ECDAnnex region2 [pw=w], cluster(settlement_code);
	qui lincom ECDAnnex - communitybased;
	scalar define bECDA=r(estimate);
	scalar define seECDA=r(se);
	qui su `y' if purecontrol==1 [aw=w];
	scalar define y0=r(mean);
	scalar define se0=r(sd);
	outreg communitybased ECDAnnex using `output'\gambia_ecd_edcc_tableA4-5c, nocons se 3aster addstat(`stat') `replace';
};

* 2) community-based ECD v. ECD Annex treatment & control, separately;
local stat ""diff(Annex(C) v. community-based)",bECDAc,"se(Annex(C) v. community-based)",-seECDAc,
	"diff(Annex(T) v. community-based)",bECDAt,"se(Annex(T) v. community-based)",-seECDAt,
	"pure control mean",y0,"pure control s.e.",-se0";
foreach y in `Y' {;
	qui reg `y' communitybased ECDAnnex_control ECDAnnex_treated region2 [pw=w], cluster(settlement_code);
	qui lincom ECDAnnex_control - communitybased;
	scalar define bECDAc=r(estimate);
	scalar define seECDAc=r(se);
	qui lincom ECDAnnex_treated - communitybased;
	scalar define bECDAt=r(estimate);
	scalar define seECDAt=r(se);
	qui su `y' if purecontrol==1 [aw=w];
	scalar define y0=r(mean);
	scalar define se0=r(sd);
	outreg communitybased ECDAnnex_control ECDAnnex_treated using `output'\gambia_ecd_edcc_tableA4-5d, nocons se 3aster addstat(`stat') `replace';
};

* REGRESSIONS: INCLUDE CONTROLS FOR BASELINE OUTCOMES;
/*limit analysis to items with both baseline & endline measures*/
local Y "zfinemotor_adj zlanghear_adj";	
	
foreach y in `Y' {;
	qui replace `y'_baseline=. if in_baseline==0|in_endline_new==1;
	/*ECD Annex combined*/
	local stat ""No. clusters",e(N_clust),"diff(Annex v. community-based)",bECDA,"se(Annex v. community-based)",-seECDA,
		"pure control mean",y0,"pure control s.e.",-se0";
	qui reg `y'_endline communitybased ECDAnnex `y'_baseline region2 [pw=w], cluster(settlement_code);
	qui lincom ECDAnnex - communitybased;
	scalar define bECDA=r(estimate);
	scalar define seECDA=r(se);
	qui su `y'_endline if purecontrol==1 [aw=w];
	scalar define y0=r(mean);
	scalar define se0=r(sd);
	outreg communitybased ECDAnnex using `output'\gambia_ecd_edcc_tableA4-5c, nocons se 3aster addstat(`stat') `replace';
	
	/*ECD Annex treatment and control, separately*/
	local stat ""diff(Annex(C) v. community-based)",bECDAc,"se(Annex(C) v. community-based)",-seECDAc,
		"diff(Annex(T) v. community-based)",bECDAt,"se(Annex(T) v. community-based)",-seECDAt,
		"pure control mean",y0,"pure control s.e.",-se0";
	qui reg `y'_endline communitybased ECDAnnex_control ECDAnnex_treated `y'_baseline region2 [pw=w], cluster(settlement_code);
	qui lincom ECDAnnex_control - communitybased;
	scalar define bECDAc=r(estimate);
	scalar define seECDAc=r(se);
	qui lincom ECDAnnex_treated - communitybased;
	scalar define bECDAt=r(estimate);
	scalar define seECDAt=r(se);
	qui su `y'_endline if purecontrol==1 [aw=w];
	scalar define y0=r(mean);
	scalar define se0=r(sd);
	outreg communitybased ECDAnnex_control ECDAnnex_treated using `output'\gambia_ecd_edcc_tableA4-5d, nocons se 3aster addstat(`stat') `replace';
};
	
* CALCULATE LEE (2009) BOUNDS;
* command only allows pairwise comparison of treatments;
* continue to weight by inverse propensity score and control for stratification by region;
* focus only on overall MDAT scores;
* comparisons: 	1) ECD Annex (pooled) v. pure control
				2) ECD Annex (treatment) v. pure control
				3) ECD Annex (pooled) v. community-based ECD
				4) ECD Annex (treatment) v. community-based ECD
				5) ECD Annex (control) v. community-based ECD;
* compare with and without controls for baseline outcomes;
/*trim for common support*/
qui drop if common_support==0;

/*drop observations with no MDAT score*/
qui drop if in_baseline_mdat==0 & in_endline_mdat==0;

* without controlling for baseline outcome;
foreach y in finemotor langhear {;
	/*ECD Annex (pooled) v. pure control*/
	di "outcome=`y', ECD Annex (pooled) v. pure control";
	leebounds z`y'_adj_endline experiment if communitybased!=1 [pw=w], select(in_endline_mdat) tight(region2);
	
	/*ECD Annex (treatment) v. pure control*/
	di "outcome=`y', ECD Annex (treatment) v. pure control";
	leebounds z`y'_adj_endline experiment if communitybased!=1 & ECDAnnex_control!=1 [pw=w], select(in_endline_mdat) tight(region2);
	
	/*ECD Annex (pooled) v. community-based ECD*/
	di "outcome=`y', ECD Annex (pooled) v. community-based ECD";
	leebounds z`y'_adj_endline experiment if purecontrol!=1 [pw=w], select(in_endline_mdat) tight(region2);

	/*ECD Annex (treatment) v. community-based ECD*/
	di "outcome=`y', ECD Annex (treatment) v. community-based ECD";
	leebounds z`y'_adj_endline experiment if purecontrol!=1 & ECDAnnex_control!=1 [pw=w], select(in_endline_mdat) tight(region2);	
	
	/*ECD Annex (control) v. community-based ECD*/
	di "outcome=`y', ECD Annex (control) v. community-based ECD";
	leebounds z`y'_adj_endline experiment if purecontrol!=1 & ECDAnnex_treated!=1 [pw=w], select(in_endline_mdat) tight(region2);	
};	

local end=`"$S_TIME"'; 
di "`start'";
di "`end'";
log close;
